Libraries

library(tidyverse)
library(readr)          # import
library(rpart)          # regression trees
library(rpart.plot)     # regression tree plots
library(summarytools)   # summary statistics
library(party)          # ctree
library(partykit)       # ctree
library(caret)
library(forecast)
library(ineq)           # Gini
library(precrec)        # ROC curves
library(corrplot)       # Correlation plots
library(plotly)         # interactive ggplot2 plots :D

Data import

We import the Austrian EU-SILC 2019 personal-level data, which includes the ad-hoc module on intergenerational transmission of disadvantages.

# Location of the EU-SILC 2019 extract on disk.
# (`<-` replaces `=` for top-level assignment, per the tidyverse style guide.)
data_path <- "./AT2019"

# Personal-level data; the Austrian SILC export is semicolon-separated.
data19 <- read.csv(file.path(data_path, "p_silc2019_ext.csv"), sep = ";")           # personal data

Data Wrangling

First, we will select and rename the variables of interest. Then we will log net income and recode some of the variables.

data19 <- data19 %>%
  
  # Keep only the raw SILC variable codes we need; they are renamed to
  # readable names in the next step.
  # NOTE(review): M002000 is selected but never renamed below -- confirm it is
  # actually needed (it appears as raw M002000 in the summary output).
  select(sex, P038004, P110000nu, P111010nu, alter, M009010, M010000, M014000, M016000, M017000, M020010, M021000, M025000, M027000, M028000, M004000, M001300, M001510, M003100, M001100, M001200, M002000, M001500) %>% 
  
  rename("inc_net"           = P038004,     # net monthly income (NOTE(review): comment previously said "gross"; the variable name says net -- confirm against the SILC codebook)
         "country_birth"     = P110000nu,   # country of birth of respondent
         "citizenship"       = P111010nu,   # citizenship of respondent
         "age"               = alter,       # age of respondent
         "father_cit"        = M009010,     # citizenship of father at age 14
         "father_edu"        = M010000,     # education of father at age 14 (highest degree attained)
         "father_occup_stat" = M014000,     # occupational status of father at age 14
         "father_occup"      = M016000,     # main occupation of father at age 14
         "father_manag"      = M017000,     # managerial position of father at age 14            
         "mother_cit"        = M020010,     # citizenship of mother at age 14
         "mother_edu"        = M021000,     # education of mother at age 14
         "mother_occup_stat" = M025000,     # occupational status of mother at age 14
         "mother_occup"      = M027000,     # main occupation of mother at age 14
         "mother_manag"      = M028000,     # managerial position of mother at age 14
         "tenancy"           = M004000,     # tenancy at age 14
         "children"          = M001300,     # number of children (under 18) in respondent’s household at age 14
         "adults"            = M001510,     # number of adults (aged 18 or more) in respondent’s household
         "adults_working"    = M003100,     # number of working adults (aged 18 or more) in respondent’s hhd.
         "father_present"    = M001100,     # father present in respondent’s household at age 14
         "mother_present"    = M001200,     # mother present in respondent’s household at age 14
         "adults_present"    = M001500,     # adults present in respondent’s household at age 14
         # NOTE(review): the trailing comma above is tolerated by dplyr/rlang
         # (trailing empty arguments are ignored) but is fragile style.
         ) %>%    
  
  # Keep the ad-hoc-module age range and drop negative answer codes
  # (refusal / don't know / not applicable) for the filtered variables.
  filter(age %in% (27:59), inc_net > 0, mother_present > 0, father_present > 0, father_cit > 0, mother_cit > 0)  %>%
  # We drop all answers where the respondents refused or were not able to provide information 
  # D (translated from German): but doesn't that then drop the whole observations??
  # D (translated from German): and if we filter by age (instead of removing the observations coded -6) we have to explain why 27-59 and not 25-59 and change the text below, because I simply removed the ones coded -6 (age outside of range).
  
  mutate("inc_net_log" = log(inc_net),   
         # logged net income per month of respondent
         
         "both_parents_present" = father_present + mother_present,            
         # 4 = none present, 3 = one present, 2 = both present
         
         sex = factor(ifelse(as.numeric(sex)==2, 1, 0)), 
         # 0 = male, 1 = female
         
         country_birth = factor(country_birth, labels = c(1, 2, 2, 2, 3, 3)), 
         # collapse the six birth-country codes into Austria, EU, Non-EU
         # (duplicated labels merge the corresponding factor levels)
         
         father_cit = ifelse(father_cit == 1, 1, 2),                          
         # Austria and Other
         
         mother_cit = ifelse(mother_cit == 1, 1, 2))   
         # Austria and Other

Data Exploration and Visualization

To get a good grasp of our data we will first look at some simple descriptives and a correlation plot. The ad-hoc module on intergenerational transmission of disadvantages only includes “selected respondents aged over 24 years and less than 60 years”. This is why we exclude the others: coding ‘-6’ means that the year of birth is outside the 1969–1994 range or that the interview was a proxy interview. Additionally, we exclude all respondents that were not part of this ad-hoc module even if they are of the desired age.

Data Summary

# Render an HTML summary table of all variables. `plain.ascii` must be FALSE
# for the "render" method; the abbreviation `F` is spelled out as FALSE since
# T/F are ordinary (reassignable) variables in R.
print(dfSummary(data19), method = "render", style = "grid", plain.ascii = FALSE)

Data Frame Summary

data19

Dimensions: 3600 x 25
Duplicates: 1
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 sex [factor] 1. 0 2. 1
1729(48.0%)
1871(52.0%)
3600 (100.0%) 0 (0.0%)
2 inc_net [integer] Mean (sd) : 1979.5 (972) min < med < max: 60 < 1900 < 13000 IQR (CV) : 1010 (0.5) 621 distinct values 3600 (100.0%) 0 (0.0%)
3 country_birth [factor] 1. 1 2. 2 3. 3
2991(83.1%)
425(11.8%)
184(5.1%)
3600 (100.0%) 0 (0.0%)
4 citizenship [integer] Mean (sd) : 1.3 (0.9) min < med < max: 1 < 1 < 6 IQR (CV) : 0 (0.7)
1:3166(87.9%)
2:129(3.6%)
3:150(4.2%)
4:53(1.5%)
5:36(1.0%)
6:66(1.8%)
3600 (100.0%) 0 (0.0%)
5 age [integer] Mean (sd) : 43.5 (9.1) min < med < max: 27 < 44 < 58 IQR (CV) : 15 (0.2) 32 distinct values 3600 (100.0%) 0 (0.0%)
6 father_cit [numeric] Min : 1 Mean : 1.2 Max : 2
1:2979(82.8%)
2:621(17.2%)
3600 (100.0%) 0 (0.0%)
7 father_edu [integer] Mean (sd) : 3 (2.7) min < med < max: -2 < 2 < 9 IQR (CV) : 2 (0.9) 11 distinct values 3600 (100.0%) 0 (0.0%)
8 father_occup_stat [integer] Mean (sd) : 1 (1.3) min < med < max: -5 < 1 < 4 IQR (CV) : 0 (1.3)
-5:124(3.4%)
-2:24(0.7%)
1:2785(77.4%)
2:595(16.5%)
3:17(0.5%)
4:55(1.5%)
3600 (100.0%) 0 (0.0%)
9 father_occup [integer] Mean (sd) : 4.8 (3.1) min < med < max: -5 < 5 < 9 IQR (CV) : 4 (0.6) 13 distinct values 3598 (99.9%) 2 (0.1%)
10 father_manag [integer] Mean (sd) : 1.2 (1.5) min < med < max: -5 < 2 < 2 IQR (CV) : 1 (1.2)
-5:124(3.4%)
-3:55(1.5%)
-2:81(2.2%)
1:1286(35.7%)
2:2054(57.1%)
3600 (100.0%) 0 (0.0%)
11 mother_cit [numeric] Min : 1 Mean : 1.2 Max : 2
1:2998(83.3%)
2:602(16.7%)
3600 (100.0%) 0 (0.0%)
12 mother_edu [integer] Mean (sd) : 2.7 (2.5) min < med < max: -2 < 2 < 9 IQR (CV) : 3 (0.9) 11 distinct values 3600 (100.0%) 0 (0.0%)
13 mother_occup_stat [integer] Mean (sd) : 2.1 (1.5) min < med < max: -5 < 1 < 4 IQR (CV) : 3 (0.7)
-5:29(0.8%)
-2:10(0.3%)
1:1931(53.6%)
2:272(7.6%)
3:209(5.8%)
4:1149(31.9%)
3600 (100.0%) 0 (0.0%)
14 mother_occup [integer] Mean (sd) : 2.6 (4.3) min < med < max: -5 < 4 < 9 IQR (CV) : 8 (1.7) 13 distinct values 3599 (100.0%) 1 (0.0%)
15 mother_manag [integer] Mean (sd) : 0.1 (2.3) min < med < max: -5 < 2 < 2 IQR (CV) : 5 (22.1)
-5:29(0.8%)
-3:1149(31.9%)
-2:137(3.8%)
1:322(8.9%)
2:1963(54.5%)
3600 (100.0%) 0 (0.0%)
16 tenancy [integer] Mean (sd) : 1.7 (0.6) min < med < max: -3 < 2 < 3 IQR (CV) : 1 (0.4)
-3:21(0.6%)
-2:5(0.1%)
1:968(26.9%)
2:2555(71.0%)
3:51(1.4%)
3600 (100.0%) 0 (0.0%)
17 children [integer] Mean (sd) : 1.1 (0.5) min < med < max: -3 < 1 < 2 IQR (CV) : 0 (0.4)
-3:22(0.6%)
-2:1(0.0%)
1:2964(82.3%)
2:613(17.0%)
3600 (100.0%) 0 (0.0%)
18 adults [integer] Mean (sd) : -2.1 (1.8) min < med < max: -3 < -3 < 7 IQR (CV) : 0 (-0.8)
-3:2903(80.6%)
-2:1(0.0%)
1:410(11.4%)
2:248(6.9%)
3:20(0.6%)
4:13(0.4%)
5:2(0.1%)
6:2(0.1%)
7:1(0.0%)
3600 (100.0%) 0 (0.0%)
19 adults_working [integer] Mean (sd) : 1.8 (1.1) min < med < max: -3 < 2 < 14 IQR (CV) : 1 (0.6) 15 distinct values 3600 (100.0%) 0 (0.0%)
20 father_present [integer] Min : 1 Mean : 1.1 Max : 2
1:3119(86.6%)
2:481(13.4%)
3600 (100.0%) 0 (0.0%)
21 mother_present [integer] Min : 1 Mean : 1 Max : 2
1:3464(96.2%)
2:136(3.8%)
3600 (100.0%) 0 (0.0%)
22 M002000 [integer] Mean (sd) : -2.9 (0.9) min < med < max: -3 < -3 < 8 IQR (CV) : 0 (-0.3)
-3:3526(97.9%)
-2:1(0.0%)
1:32(0.9%)
2:12(0.3%)
3:5(0.1%)
4:2(0.1%)
5:1(0.0%)
6:8(0.2%)
7:11(0.3%)
8:2(0.1%)
3600 (100.0%) 0 (0.0%)
23 adults_present [integer] Mean (sd) : 1.7 (0.8) min < med < max: -3 < 2 < 2 IQR (CV) : 0 (0.5)
-3:22(0.6%)
-2:105(2.9%)
1:697(19.4%)
2:2776(77.1%)
3600 (100.0%) 0 (0.0%)
24 inc_net_log [numeric] Mean (sd) : 7.5 (0.5) min < med < max: 4.1 < 7.5 < 9.5 IQR (CV) : 0.5 (0.1) 621 distinct values 3600 (100.0%) 0 (0.0%)
25 both_parents_present [integer] Mean (sd) : 2.2 (0.4) min < med < max: 2 < 2 < 4 IQR (CV) : 0 (0.2)
2:3057(84.9%)
3:469(13.0%)
4:74(2.1%)
3600 (100.0%) 0 (0.0%)

Generated by summarytools 0.9.8 (R version 4.0.3)
2021-02-14

The data set includes 25 variables and 3600 observations. There are almost no missing values. There are slightly more females (52%) than males (48%). The income distribution is skewed to the right, meaning the median income is lower than the mean income. 83% of respondents were born in Austria and 88% are Austrian citizens. In about 85% of respondents’ households at age 14 both parents were present, and in 96% the mother was present….

Gini Index and Lorenz curve

# Gini coefficient of net monthly income (0 = perfect equality, 1 = maximal inequality).
ineq(data19$inc_net, type = "Gini")
## [1] 0.2543448

The Gini index is 0.25, which is a bit lower than the World Bank estimate for Austria of 0.3 (2017), available at https://data.worldbank.org/indicator/SI.POV.GINI?locations=AT.

# Lorenz curve of net monthly income; the black diagonal is the line of perfect equality.
plot(Lc(data19$inc_net), col = "darkred", lwd = 3)

The Gini index corresponds to twice the area between the black line of equal distribution and the red Lorenz curve of the actual distribution.

Age pyramid

# Population pyramid: female counts plotted to the right, male counts
# (negated) to the left; coord_flip() puts age on the vertical axis.
age_breaks <- seq(27, 59, 2)

age_pyramid <- ggplot(data19, aes(x = age, fill = sex)) +
  geom_bar(data = subset(data19, sex == 1)) +
  geom_bar(data = subset(data19, sex == 0), aes(y = -..count..)) +
  scale_x_continuous(breaks = age_breaks, labels = abs(age_breaks)) +
  scale_fill_manual(
    name = "Sex",
    labels = c("Male", "Female"),
    values = c("springgreen2", "slateblue1")
  ) +
  labs(
    title = "Age pyramide of ad-hoc module on intergenerational transmission of disadvantages",
    x = "Age",
    y = "Number of people"
  ) +
  theme_bw() +
  coord_flip()

# Interactive version of the pyramid.
ggplotly(age_pyramid)

In the Data Frame Summary above we already saw that there are slightly more females (1) than males (0) in the data set and that the median age is 44 — while the age distribution of the sample is fairly even, there are somewhat more older than younger individuals.

# Median age of the sample (referenced in the text above).
median(data19$age)
## [1] 44

Correlation plot

# Copy the data and coerce the factor columns to numeric so cor() accepts them.
data19cor <- data19
data19cor$sex <- as.numeric(data19cor$sex)
data19cor$country_birth <- as.numeric(data19cor$country_birth)

# Dropping the categorical variables father_occup & mother_occup
data19cor <- select(data19cor, -c(father_occup, mother_occup))

# BUG FIX: cor.mtest() re-runs cor.test() on every pair of COLUMNS OF THE RAW
# DATA, so it must be called before data19cor is overwritten with the
# correlation matrix. The original code called it on the correlation matrix
# itself, which yields meaningless p-values.
res1 <- cor.mtest(data19cor, conf.level = 0.99)

# Computing the correlation coefficients.
data19cor <- cor(data19cor)

# Upper-triangle ellipse plot, ordered by the first principal component;
# correlations not significant at the 1% level are blanked out.
corrplot(data19cor, method = "ellipse", type = "upper", order = "FPC", diag = FALSE, outline = FALSE, tl.cex = .5, tl.col = "black", title = "Correlation plot", p.mat = res1$p, sig.level = 0.01, insig = "blank", mar=c(2,2,2,2))

As can be seen from the correlation plot, all variables are significantly related to at least one other variable of the data set (at the 1% significance level). For better visibility insignificant correlations are blanked out. As the correlation matrix is ordered using the first principal component there is some clustering of significant correlations.

Regression Tree

# Seed for the train/test split and caret's CV fold assignment.
set.seed(123)

# Circumstance variables used by every model below. Later chunks reference
# this object, so the name `formula` must stay (even though it shadows
# stats::formula). `<-` replaces `=` for top-level assignment.
formula <- inc_net_log ~ sex + country_birth + father_cit + father_edu + father_occup_stat + father_occup + father_manag + mother_cit + mother_edu + mother_occup_stat + mother_occup + mother_manag + tenancy + children + adults + adults_working + both_parents_present

# Random 80/20 train/test assignment per row.
data19 <- data19 %>%
  mutate(train_index = sample(c("train", "test"), nrow(data19), replace = TRUE, prob = c(0.80, 0.20)))

train <- data19 %>% filter(train_index == "train")
test <- data19 %>% filter(train_index == "test")

# 10-fold cross-validation, repeated 10 times (`T` spelled out as TRUE).
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10, savePredictions = TRUE)

# Cost-complexity (cp) grid over which rpart is tuned.
tuning_grid <- expand.grid(cp = seq(0, 0.02, by = 0.005))
tuning_grid
##      cp
## 1 0.000
## 2 0.005
## 3 0.010
## 4 0.015
## 5 0.020

# NOTE(review): the model is trained on the FULL data set (data19), not on
# `train`, so the later evaluation on `test` is partly in-sample for this
# model -- confirm this is intended (see the translated note further below).
caret_rpart <- train(formula, data = data19, method = "rpart", trControl = fitControl, tuneGrid = tuning_grid, metric = "RMSE", na.action = na.pass)

caret_rpart
## CART 
## 
## 3600 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 3241, 3240, 3239, 3240, 3239, 3240, ... 
## Resampling results across tuning parameters:
## 
##   cp     RMSE       Rsquared   MAE      
##   0.000  0.5160590  0.1197761  0.3786742
##   0.005  0.4789493  0.1802058  0.3461936
##   0.010  0.4833582  0.1649875  0.3497187
##   0.015  0.4853702  0.1579280  0.3515222
##   0.020  0.4852788  0.1582175  0.3514775
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.005.
# Extract the rpart tree selected by caret (cp = 0.005 per the output above) and plot it.
tree_caret_final <- caret_rpart$finalModel
rpart.plot(tree_caret_final, box.palette="RdBu", nn=FALSE, type=2)

Conditional inference tree

# For the Inference Tree to work, we must have all variables as numeric data

# Conditional inference tree on the training split only; the
# Bonferroni-adjusted significance tests act as the built-in stopping rule.
Ctree <- ctree(formula, data = train, control = ctree_control(testtype = "Bonferroni")) # NOTE(review): this already includes the internal stopping control for inference trees; cross-validation via caret is attempted further below but did not work out yet
Ctree
## 
## Model formula:
## inc_net_log ~ sex + country_birth + father_cit + father_edu + 
##     father_occup_stat + father_occup + father_manag + mother_cit + 
##     mother_edu + mother_occup_stat + mother_occup + mother_manag + 
##     tenancy + children + adults + adults_working + both_parents_present
## 
## Fitted party:
## [1] root
## |   [2] sex in 0
## |   |   [3] country_birth in 1, 2
## |   |   |   [4] mother_cit <= 1
## |   |   |   |   [5] father_edu <= 2
## |   |   |   |   |   [6] mother_edu <= -2: 7.288 (n = 9, err = 6.5)
## |   |   |   |   |   [7] mother_edu > -2: 7.696 (n = 777, err = 100.1)
## |   |   |   |   [8] father_edu > 2: 7.780 (n = 365, err = 73.9)
## |   |   |   [9] mother_cit > 1
## |   |   |   |   [10] father_edu <= 6
## |   |   |   |   |   [11] adults_working <= 0: 7.092 (n = 7, err = 1.7)
## |   |   |   |   |   [12] adults_working > 0: 7.540 (n = 116, err = 9.9)
## |   |   |   |   [13] father_edu > 6: 7.754 (n = 46, err = 12.8)
## |   |   [14] country_birth in 3: 7.313 (n = 71, err = 26.7)
## |   [15] sex in 1
## |   |   [16] mother_edu <= 2
## |   |   |   [17] father_cit <= 1: 7.259 (n = 897, err = 239.5)
## |   |   |   [18] father_cit > 1: 7.089 (n = 191, err = 58.9)
## |   |   [19] mother_edu > 2: 7.376 (n = 398, err = 132.1)
## 
## Number of inner nodes:     9
## Number of terminal nodes: 10
# Plot the conditional inference tree with compact node labels and a small font.
plot(Ctree, type = "simple",gp = gpar(fontsize = 6),
  inner_panel=node_inner,
  ip_args=list(id = FALSE), main = "Conditional Inference Tree for Austria 2019") # TODO (translated from German): adjust the title font size

Instead of the default tuning function we use the caret package for cross validation.

### NOTE (translated from German): the data argument was changed to data19 (full data) from `train`!
### NOTE(review): tuning on the full data means the `test` split is not held out for this model.
caret_ctree <- train(formula, data = data19, method = "ctree", trControl = fitControl, na.action = na.pass)
caret_ctree
## Conditional Inference Tree 
## 
## 3600 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 3240, 3241, 3240, 3240, 3240, 3240, ... 
## Resampling results across tuning parameters:
## 
##   mincriterion  RMSE       Rsquared   MAE      
##   0.01          0.4923692  0.1519835  0.3570260
##   0.50          0.4799440  0.1780711  0.3453309
##   0.99          0.4783652  0.1817853  0.3442732
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Refit a single ctree on the full data using the caret-selected
# mincriterion (0.99, per the output above) so it can be plotted with partykit.
caret_ctree_B <- ctree(formula, data = data19, control = ctree_control(testtype = "Bonferroni", mincriterion = 0.99)) 
caret_ctree_B
## 
## Model formula:
## inc_net_log ~ sex + country_birth + father_cit + father_edu + 
##     father_occup_stat + father_occup + father_manag + mother_cit + 
##     mother_edu + mother_occup_stat + mother_occup + mother_manag + 
##     tenancy + children + adults + adults_working + both_parents_present
## 
## Fitted party:
## [1] root
## |   [2] sex in 0
## |   |   [3] country_birth in 1, 2
## |   |   |   [4] mother_cit <= 1
## |   |   |   |   [5] mother_occup_stat <= -2: 7.306 (n = 15, err = 7.1)
## |   |   |   |   [6] mother_occup_stat > -2: 7.725 (n = 1416, err = 209.5)
## |   |   |   [7] mother_cit > 1
## |   |   |   |   [8] father_edu <= 6: 7.518 (n = 152, err = 21.4)
## |   |   |   |   [9] father_edu > 6: 7.820 (n = 54, err = 15.7)
## |   |   [10] country_birth in 3: 7.326 (n = 92, err = 32.6)
## |   [11] sex in 1
## |   |   [12] father_edu <= 2
## |   |   |   [13] father_cit <= 1: 7.253 (n = 1055, err = 290.7)
## |   |   |   [14] father_cit > 1: 7.036 (n = 204, err = 64.0)
## |   |   [15] father_edu > 2: 7.372 (n = 612, err = 169.4)
## 
## Number of inner nodes:    7
## Number of terminal nodes: 8
# Plot of the refitted tree (full variable names, small font).
plot(caret_ctree_B,gp = gpar(fontsize = 6),
  inner_panel=node_inner,
  ip_args=list(abbreviate = FALSE,id = FALSE), main = "Opportunity Conditional Inference Tree for Austria 2019 - Cross Validated")

# (translated from German) This is the same tree as above, only created with a
# different package. It does not look good, though.
plot(caret_ctree,gp = gpar(fontsize = 6),
  inner_panel=node_inner,
  ip_args=list(id = FALSE))

plot(caret_ctree$finalModel, type = "simple")

Graphic representation of the tuning parameters

plot(caret_ctree) # RMSE vs. mincriterion (1 - p-value), our resampling tuning parameter

plot(caret_rpart) # RMSE vs. cp, the cost-complexity tuning parameter

# plotcp(tree_1)

Predictions

# Out-of-sample predictions of the train-split ctree on the test split.
test$P_AtCt <- predict(Ctree, newdata = as.data.frame(test))
# Squared prediction error per test observation.
test$perror <- (test$P_AtCt - test$inc_net_log)^2
# Test RMSE (a scalar recycled into a column; `T` spelled out as TRUE).
test$RMSE <- sqrt(sum((test$P_AtCt - test$inc_net_log)^2 / nrow(test), na.rm = TRUE))
head(test$RMSE)
## [1] 0.459181 0.459181 0.459181 0.459181 0.459181 0.459181
plot(test$P_AtCt, test$inc_net_log) # TODO (translated from German): add a ggplot version and make it pretty!

# NOTE(review): caret_rpart was fitted on the full data set, so these "test"
# predictions are partly in-sample and the RMSE below is likely optimistic.
test$P_AtCt_caret <- predict(caret_rpart, newdata = as.data.frame(test))
test$perror_caret <- (test$P_AtCt_caret - test$inc_net_log)^2
test$RMSE_caret <- sqrt(sum((test$P_AtCt_caret - test$inc_net_log)^2 / nrow(test), na.rm = TRUE))
head(test$RMSE_caret)
## [1] 0.4566333 0.4566333 0.4566333 0.4566333 0.4566333 0.4566333

Random Forest

# Conditional inference forest: 500 trees with Bonferroni-adjusted split
# tests, each tree grown on an 80% subsample drawn without replacement.
cf <- cforest(formula, data19, na.action = na.pass, control = ctree_control(teststat = "quadratic", testtype = "Bonferroni", mincriterion = 0.99), ytrafo = NULL, scores = NULL, ntree = 500L, perturb = list(replace = FALSE, fraction = 0.8))

# Out-of-bag predictions (currently unused; kept for the planned RMSE comparison).
hat_cf <- predict(cf, newdata = test, OOB = TRUE, type = "response")

# TODO: Calculate the RMSE by hand for cforest and boosted ctree to compare 

# Permutation variable importance. Computed ONCE and reused below:
# varimp() on a 500-tree forest is expensive, and because the permutations
# are random, the two separate calls in the original code printed and
# plotted slightly different numbers.
cf_importance <- varimp(cf, mincriterion = 0, OOB = TRUE)
cf_importance
##                  sex        country_birth           father_cit 
##         0.0677898397         0.0059506319         0.0056707554 
##           father_edu         father_occup           mother_cit 
##         0.0054289985         0.0028044436         0.0065297354 
##           mother_edu    mother_occup_stat         mother_occup 
##         0.0040748970         0.0009257607         0.0004644832 
##         mother_manag              tenancy             children 
##         0.0017892779        -0.0001953003         0.0004404371 
##               adults       adults_working both_parents_present 
##        -0.0006739003         0.0003050525         0.0005388467

# Tidy the named importance vector into a data frame sorted by importance.
importance_cf <- data.frame(importance = cf_importance)
importance_cf$var_name <- rownames(importance_cf)
importance_cf <- importance_cf %>% arrange(desc(importance))

# Lollipop chart of variable importance, largest values at the top.
varimpo <- ggplot(importance_cf, aes(x = var_name, y = importance)) +
    geom_pointrange(shape = 21, colour = "black", fill = "white", size = 3, stroke = 1, aes(ymin = 0, ymax = importance)) +
    scale_x_discrete(limits = importance_cf$var_name[order(importance_cf$importance)]) +
    labs(title = "Conditional Forest variable importance - Austria 2019", x = "", y = "Mean decrease in sum of squared residuals") +
    coord_flip() +
    theme_light() +
    theme(axis.line = element_blank(), panel.border = element_blank(), panel.grid.major.y=element_blank())

ggplotly(varimpo)

We find that the variable sex is the single most important variable in determining one’s income in Austria. However, sex is not a generationally transmittable circumstance and, while it is a circumstance, it is not exactly what we were trying to answer with our exercise. Therefore, we exclude it in the next step and create a new conditional inference forest.

Boosted Inference Tree

# Earlier attempt with a genuinely boosted tree (mboost::blackboost), kept for reference:
# cf_boosted <- blackboost(formula, data = data19, na.action = na.pass, control = boost_control(), tree_controls = partykit::ctree_control())
# cf_boosted
# 
# NOTE(review): caret method "ctree2" fits a SINGLE conditional inference tree
# with a tuned maximum depth -- it is not boosting, despite the variable name
# and the "Boosted Inference Tree" heading above. Confirm which model was
# actually intended here.
 cf_boosted_2 <- train(formula, data19, method = "ctree2", trControl = fitControl, tuneGrid = NULL, na.action = na.pass)
 cf_boosted_2
## Conditional Inference Tree 
## 
## 3600 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 3240, 3240, 3240, 3241, 3240, 3240, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  mincriterion  RMSE       Rsquared   MAE      
##   1         0.01          0.4851689  0.1581313  0.3514751
##   1         0.50          0.4851689  0.1581313  0.3514751
##   1         0.99          0.4851689  0.1581313  0.3514751
##   2         0.01          0.4810092  0.1726096  0.3479951
##   2         0.50          0.4810092  0.1726096  0.3479951
##   2         0.99          0.4810092  0.1726096  0.3479951
##   3         0.01          0.4796566  0.1774570  0.3457560
##   3         0.50          0.4798489  0.1768308  0.3459463
##   3         0.99          0.4792887  0.1784544  0.3458118
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were maxdepth = 3 and mincriterion = 0.99.
 plot(cf_boosted_2$finalModel, type = "simple")

# TODO: calculate the RMSE so this model can be compared with the others.

Variable Importance